{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(60, 2)"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from numpy import * \n",
    "\n",
    "# 数据导入\n",
    "def loadDataSet(fileName):\n",
    "    dataMat = []\n",
    "    fr = open(fileName)\n",
    "    for line in fr.readlines():\n",
    "        curLine = line.strip().split('\\t')\n",
    "        fltLine = list(map(float, curLine))\n",
    "        dataMat.append(fltLine)\n",
    "    return mat(dataMat)\n",
    "\n",
    "# 计算距离\n",
    "def distEclud(vecA, vecB):\n",
    "    return sqrt(sum(power(vecA-vecB, 2)))\n",
    "\n",
    "# 随机质心\n",
    "def randCent(dataSet, k):\n",
    "    n = shape(dataSet)[1]\n",
    "    centroids = mat(zeros((k, n)))\n",
    "    for j in range(n):\n",
    "        minJ = min(dataSet[:, j])\n",
    "        rangeJ = float(max(dataSet[:, j]) - minJ)\n",
    "        centroids[:, j] =  minJ + rangeJ * random.rand(k,1)\n",
    "    return centroids\n",
    "\n",
    "\n",
    "dataMat = loadDataSet('testSet2.txt')\n",
    "randCent(dataMat, 2)\n",
    "dataMat.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[-0.45965615, -2.7782156 ],\n",
       "        [-2.94737575,  3.3263781 ],\n",
       "        [ 2.93386365,  3.12782785]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "    k均值据类算法\n",
    "    算法会创建k个质心，然后将每个点分配到最近的质心，在重新计算质心。该过程重复执行，知道质心不再改变\n",
    "\"\"\"\n",
    "def KMeans(dataSet, k, distMeas=distEclud, createCent=randCent):\n",
    "    m = shape(dataSet)[0] \n",
    "    clusterAssment = mat(zeros((m, 2)))\n",
    "#     print(clusterAssment)\n",
    "    centroids = createCent(dataSet, k)\n",
    "    clusterChanged = True\n",
    "    while clusterChanged:\n",
    "        clusterChanged = False\n",
    "        for i in range(m):\n",
    "            minDist = inf; minIndex = -1\n",
    "            for j in range(k):\n",
    "                distJI = distMeas(centroids[j, :], dataSet[i, :])\n",
    "                if distJI < minDist:\n",
    "                    minDist = distJI; minIndex = j\n",
    "            if clusterAssment[i, 0] != minIndex:\n",
    "                clusterChanged = True\n",
    "            clusterAssment[i, :] = minIndex, minDist**2\n",
    "#         print(clusterAssment)\n",
    "        for cent in range(k):\n",
    "#             print(nonzero(clusterAssment[:,0].A == cent)[0])\n",
    "#             print((clusterAssment[:, 0].A==cent), type(clusterAssment[:,0].A==cent))\n",
    "            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A==cent)[0]]\n",
    "            centroids[cent, :] = mean(ptsInClust, axis=0)\n",
    "    return centroids, clusterAssment\n",
    "\n",
    "\n",
    "centroids, clusterAssment = KMeans(dataMat, 3)\n",
    "centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.collections.PathCollection at 0x20de81827b8>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAD8CAYAAABjAo9vAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFC1JREFUeJzt3W+I3Vedx/HPN5M0Oya6Q5hZSiYZ03aXYLYRww61MA8MqZrWlhraB9uKIvggTxRasOlObGFdEBoIa32grAR3YaF/wkLj6JpqbI0RtpDiJNOaDTHSbWvaG8VkY6rdDM2ffvfBzLTz5/6/5/7O+Z37fkEg85ub+zv33tzPPfec7zk/c3cBAPKxLHYDAABhEewAkBmCHQAyQ7ADQGYIdgDIDMEOAJkh2AEgMwQ7AGSGYAeAzCyPcdLBwUHfsGFDjFMDQGkdO3bsvLsPNbpdlGDfsGGDJicnY5waAErLzH7bzO0YigGAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZiVLuWAYTUxXtPXRaZy9Oa+1Av3Zt36gdW4ZjNwtAolLKDIK9iompinYfOKHpK9ckSZWL09p94IQkEe4AlkgtMxiKqWLvodPvvUBzpq9c095DpyO1CEDKUssMgr2KsxenWzoOoLellhkEexVrB/pbOg6gt6WWGQT7IhNTFV26fHXJ8f4Vfdq1fWOEFgFI3a7tG9W/om/BsZiZUZrJ0yJmnBdPgMwZ6F+hr9/9t0ycAqhqLhuoimlBUTPO1SZAJGnVyuWEOoC6dmwZTiYnSjEUU9SMc2oTIADQjlIEe1GBm9oECAC0oxTBXlTgpjYBAhRtYqqisT2HdcP4QY3tOayJqUrsJqENpQj2ogJ3x5ZhPXbPZg0P9MskDQ/067F7NiczbgZ009xcVuXitFzvz2UR7uVTisnTImecU5oAAYpUby6L90S5lCLYJQIX6DaKB/JRiqEYAN1H8UA+CHbUxWRa76B4IB+lGYpB8VLbihTdldrqSbSPYEdNTKb1Huay8hBsKMbM+sxsysx+FOo+EVetSbMKk2lA0kL22B+QdErShwLeZxApXbKqTNYO9FcNcdPMc8pziFbxXixGkB67ma2TdKek74W4v5BYdNG+Xds3yqocd4mrSaFlvBeLE2oo5luSHpb0bqD7Cya1S1aVqcpkx5ZheY3fUduMVqX2XsxZx8FuZndJ+oO7H2twu51mNmlmk+fOnev0tE1LadFFGXssw9Q2I5CU3ou5C9FjH5N0t5m9Lmm/pG1m9sTiG7n7PncfdffRoaGhAKdtTkqLLsrYY6G2GaGk9F4sQsxv5x0Hu7vvdvd17r5B0n2SDrv75ztuWSApBVMZeyxsjIZQUnovdlvsb+fZ17GntOiiVpVJ6j0WapsRQkrvxW6LvQYkaLC7+xFJR0LeZwipBNOu7RuXXFM11x4LUE0q78Vui/3tnL1iCsSwBtAbYs8nZD8Uk5p2eiws6gDKJfa3c4I9cUVtxMWHR+8q6rUvw/+xUG2MPZ9g7rWWoHTP6OioT05OFn7eMjp+48d0+eo13fe5PQuODw/064XxbUHOsfjDQ5rpXTBMlL+iXvtG50kh9Lv5XIR6fGZ2zN1HG92OMfbEXb56rerxkJMwZayvRxhFvfb1zhO7NLCZNnYixuMj2BN33fK+qsdDTsKwi2NvqLZgpqjqjXrnSaVj0a3nIsbjI9gTN7KmX8ts4VZcoSdhan1IzO3iiDTUWsnYzArHWr3Gv+xfUfVcoas36lWJxC4NnN+WVo43K8bjI9gTN7h6pW4cWtXVEkl2cUxfrWB+dOJEU1/za/UazVTIatB6q04XB+f+p8a1/6nxwhfudWtlbIzSR4K9BAZXr9QL49v02p479cL4tuCTSuzimL5awfz0i2809TW/1ut48dKVQtZW1FvDUS1Ql5kVvnCvW+tMYmylQLljKrZurX78F7+o//sjR4Kcfrik2x30ilrBfK1GVdvi29fbzqKo1aC1zrO4NPC65X0aWdOvWyJUZHXjuYhR+kiwQ1L8BRWor1Yw95lVDffFH8ipv74LAvXo3riN6YKit1Ig2FNRq+c911MP1DOvJfaCik6kUAPdbbWC+d6/G9YzxyoNA7vMry9aR7DjPWXcoKmolbmx1Qvm0Q+vaSqwy/L6nn/7HZ25MK17xw/yAdQmVp6mrqAee1mN7TlcdYgi5MpcdEmVeaPzb7+jwWNHJUlH198saWYi9cahVRpcvXLmRj38Xmh25Sk9dnQs5lBIKjXQCOPMhWkNLjr2rvvM8blgR0MEOzoSeyikrBcvgar2vO8dP6innxqXpAX7I5mk1/bcueC2vTC30i7q2BPx6MQJ3bT7WW0YP6ibdj+rRydmwvH82+/o+JmLUa6b2IzYy8F76XJrvaDZxTyp7C+TKnrsgXTSe3h04oSeOHrmvZ+vueuJo2f02rm3dfyOf0x6YjD2UAjVHnnZtX2jlv2L6d15c3/VPqhjX3oudQR7AJ0ORzz94htVj7/wPxeWHEvtP28KQyFlqfZAYzu2DOv80CqduTAtk2p+UMfuUKSOYA+g095DrdWDtaT0nzf1hS8on8HVKzW4euWSMfX5UuhQpIwx9gA67T30WbUtuGpL6T8v13FFDMyt1EePPYBOew/3f3z9gjH2OWM3rdHxM28l3xtmKARBNVGnztxKfQR7AM0MR9SbXP3Gjs2SZsbar7mrz0z3f3y9vrFjMyVdQA10KGpj5Wkg9QKYa4oCCIGVpwWr13soS2kW3w6A95X5/UCwF6AMpVmxV5ACKSn7+4GqmALEuDRWq2KvIAVSUvb3A8FegDKUZpXhWwXia+bC2Tko+/uBYC9AGWq9y/CtAnH10v4sZX8/MMZekNRLs1hBikbKUgQQQtnfDwQ7JLHgA42VfXiiFWV/PxDseE/q3yoQV6/tz1Lm9wNj7ACaErMIoFcmbUOhxw6gKbGGJ0LVlJd5wVGrCHYATYsxPBFi0rbsC45a1fFQjJmtN7Ofm9kpMztpZg+EaBgASGEmbcu+4KhVIcbYr0r6qrt/RNKtkr5sZpsC3C8ABKkprzbpK+VZ0SMFCHZ3/527H5/9+58lnZKU33cbAFF0Omk7MVVRrUvZ5FrRE3SM3cw2SNoi6cWQ9wugd3U6abv30GlV25zcpNIsOGpVsGA3s9WSnpH0oLv/qcrvd0raKUkjIyOhTgugB3QyaVtruMWV58SpFKiO3cxWaCbUn3T3A9Vu4+773H3U3UeHhoZCnBYAGqo13DKc6TCMFKYqxiT9q6RT7v7NzpsEAOGUYXfV0EL02MckfUHSNjN7afbPZwLcLwB0rAy7q4bW8Ri7u/+XVHPSGQCiK/O+L+1grxgAyAxbChSsl/arABAHwV6gXtuvAkAcDMUUqNf2qwAQB8FeoF66Ag2AeAj2ApX9ArkAyoFgbyDklVt6caEEgOIxeVpH6MnOsl8gF0A5EOx1hLhyy2K9tlACQPEYiqmDyU4AZUSw18FkJ4AyItjrYLITQBkxxl4Hk50Ayohgb4DJTgBlw1AMAGSGYAeAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZIdgBIDMEOwBkhmAHgMwQ7ACQGfaKARDMxFSFTfMSQLADCCL0pSTRPoZiAARR71KSKBbBDiAILiWZDoIdQBBcSjIdBDuAILiUZDqYPAUQBJeSTAfBDiAYLiWZBoZiACAzQYLdzG43s9Nm9oqZjYe4TwBAezoOdjPrk/QdSXdI2iTpfjPb1On9AgDaE6LHfoukV9z9VXe/LGm/pM8GuF8AQBtCBPuwpDfm/fzm7DEAQAQhgt2qHPMlNzLbaWaTZjZ57ty5AKcFAFQTItjflLR+3s/rJJ1dfCN33+fuo+4+OjQ0FOC0AIBqQgT7LyX9jZndYGbXSbpP0g8D3C8AoA0dL1By96tm9hVJhyT1Sfo3dz/ZccsAAG0JsvLU3Z+V9GyI+wIAdIaVpwCQGYIdADJDsANAZgh2AMgMwQ4AmSHYASAzBDsAZIZgB4DMEOwAkBmCHQAyQ7ADQGaC7BUDhDYxVdHeQ6d19uK01g70a9f2jdqxheu3AM0g2JGciamKdh84oekr1yRJlYvT2n3ghCQR7kATGIpBcvYeOv1eqM+ZvnJNew+djtQioFwIdiTn7MXplo4DWIhgR3LWDvS3dBzAQgQ7krNr+0b1r+hbcKx/RZ92bd8YqUVAuTB5iuTMTZBSFQO0h2BHknZsGSbIgTYR7AiG2nMgDQQ7gqD2HEgHk6cIgtpzIB0EO4Kg9hxIB8GOIKg9B9JBsCOImLXnE1MVje05rBvGD2psz2FNTFW6fk4gZUyeIohYtedM2gJLEewIJkbteb1JW4IdvYqhGJQak7bAUgQ7So1JW2Apgh2lxoZhwFKMsaPU2DAMWIpgR+mxYRiwEEMxAJAZgh0AMkOwA0BmOgp2M9trZr82s1+Z2ffNbCBUwwAA7em0x/6cpJvd/aOSfiNpd+dNAgB0oqNgd/efuvvV2R+PSlrXeZMAAJ0IOcb+JUk/Dnh/AIA2NKxjN7PnJV1f5VePuPsPZm/ziKSrkp6scz87Je2UpJGRkbYaCwBorGGwu/sn6/3ezL4o6S5Jt7m717mffZL2SdLo6GjN2yFtXLAaSF9HK0/N7HZJ/yDpE+5+KUyTkCr2PgfKodMx9m9L+qCk58zsJTP7boA2IVFcsBooh4567O7+16EagvSx9zlQDmwChqatHehXpUqIs/f5DOYfkAq2FEDT2Pu8trn5h8rFabnen3/gwtqIgR47GprfEx34wAqtXL5Mb01fid4rTamHzLVXkRKCHXUtroT546Ur6l/Rp8f//mNRAyu1Ch3mH5AShmJQV6qVME23a+vWmT9dxrVXkRKCHXWl2hNtpl0TUxUdP3NRR1/9X43tOdzV8W7mH5ASgh11pdoTbdSuuaGay1cXDtV0K9x3bBnWY/ds1vBAv0zS8EC/HrtnM+PriIIxdtS1a/vGBWPZUho90UbtijGZybVXkQqCHXXNBVUq1SfNtivVISSgCAQ7Gkq1J1qtXXMlkLV2mYs9hAQUgWBHslqqU9+6Vefffkdrz/2f/nneJqO3vvHfkqT9T41rmZluHFol/WTl+//uyJEuPgIgDoIdSWqnTv3MhWm9W2Pn6OuW92lkTb8GV6+s+nsgJwQ7ktTy5OeRI7p3/OCSIZj9T41Lkm599aUutRRID8GOYEIu8W9n8rPWJmXXLe+rcuu4UtoOAfmhjh1BhN4Eq536+WqLhJaZaWRNcxOmE1MVje05rBvGD3Z1QRMbhqHbCHYEEXrrgXZWclZbJHTj0KqmxtWLDNtUt2lAPhiKQRCh68bbrZ9fUgL5k+YmS4tc0ESNPbqNYEcQ3bgIR5H180WGLRcsQbcxFIMgkt0E68iRpmrVi9wTJ9nnCtkg2BFE2TfBKjJsy/5cIX3mNRZ0dNPo6KhPTk4Wfl6kI8VyvxTbBMxnZsfcfbTR7RhjR+FSu/rRnHbH9PlAQGoIdhQup+uDNvshRfijSAQ7CpdTuV8zH1KNwp/QR2gEOwqXU7lfMx9SjRYkpTgshXKjKgaFy6ncr5kyyXrhzypUdAPBjsLlVO7XzIdUvfDPaVgK6WAoBlGkelWmVjWz9UG967P+03+e1B8vXVlyvwMfWNH9xiNbBDvQoUYfUvXC/+s/PFn130RYXoKMEOxAAWqF/1vTS3vr9Y4DzWCMHYioyD1q0DsIdmSlqItlhJJThRDSwVAMspHqVgX1tLvvPFAPwY5slHWrglwqhJCOIEMxZvaQmbmZDYa4P6Ad1IQDMzoOdjNbL+lTks503hygfUxEAjNC9Ngfl/SwJCpvERUTkcCMjsbYzexuSRV3f9nMAjUJaA8TkcCMhsFuZs9Lur7Krx6R9DVJn27mRGa2U9JOSRoZGWmhiUDzmIgEOrg0npltlvQzSZdmD62TdFbSLe7++3r/lkvjIWXsj45Udf3SeO5+QtJfzTvh65JG3f18u/cJxFbGWnhgMVaeAvOwPzpyEGyBkrtvCHVfQCzUwiMH9NiBeaiFRw4IdmAeauGRA/aKAeahFh45INiBRaiFR9kxFAMAmSHYASAzBDsAZIZgB4DMEOwAkJm2NwHr6KRm5yT9tvATt25QUi/vfdPrj1/iOej1xy+l9Rx82N2HGt0oSrCXhZlNNrOTWq56/fFLPAe9/vilcj4HDMUAQGYIdgDIDMFe377YDYis1x+/xHPQ649fKuFzwBg7AGSGHjsAZIZgb5KZPWRmbmaDsdtSJDPba2a/NrNfmdn3zWwgdpuKYGa3m9lpM3vFzMZjt6doZrbezH5uZqfM7KSZPRC7TTGYWZ+ZTZnZj2K3pRUEexPMbL2kT0k6E7stETwn6WZ3/6ik30jaHbk9XWdmfZK+I+kOSZsk3W9mm+K2qnBXJX3V3T8i6VZJX+7B50CSHpB0KnYjWkWwN+dxSQ9L6rkJCXf/qbtfnf3xqKR1MdtTkFskveLur7r7ZUn7JX02cpsK5e6/c/fjs3//s2bCraf2MjazdZLulPS92G1pFcHegJndLani7i/HbksCviTpx7EbUYBhSW/M+/lN9ViozWdmGyRtkfRi3JYU7lua6dC9G7shreJCG5LM7HlJ11f51SOSvibp08W2qFj1Hr+7/2D2No9o5uv5k0W2LRKrcqznvq1JkpmtlvSMpAfd/U+x21MUM7tL0h/c/ZiZbY3dnlYR7JLc/ZPVjpvZZkk3SHrZzKSZYYjjZnaLu/++wCZ2Va3HP8fMvijpLkm3eW/Ux74paf28n9dJOhupLdGY2QrNhPqT7n4gdnsKNibpbjP7jKS/kPQhM3vC3T8fuV1NoY69BWb2uqRRd09lQ6CuM7PbJX1T0ifc/Vzs9hTBzJZrZqL4NkkVSb+U9Dl3Pxm1YQWymZ7Mv0u64O4Pxm5PTLM99ofc/a7YbWkWY+xo5NuSPijpOTN7ycy+G7tB3TY7WfwVSYc0M2n4H70U6rPGJH1B0rbZ1/2l2d4rSoAeOwBkhh47AGSGYAeAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZIdgBIDP/D6hKsSLCa20iAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# dataMat.A\n",
    "plt.scatter(dataMat.A[:, 0], dataMat.A[:, 1])\n",
    "plt.scatter(centroids.A[:, 0], centroids.A[:, 1], marker='+', c='red', s=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 避免聚类收敛到局部最小\n",
    "* 合并最近的质心\n",
    "* 合并两个使得SSE增幅最小的质心"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[-0.15772275000000002, 1.2253301166666664]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean(dataMat, axis=0).tolist()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sseSplit and notSplit 570.7227574246755 0.0\n",
      "len(centList): 1, bestCentToSplit:0\n",
      "The bestCentToSplit is : 0\n",
      "The len os bestClustAss is : 60\n",
      "The bestNewCents: [[ 1.23710375  0.17480612]\n",
      " [-2.94737575  3.3263781 ]]\n",
      "The bestNewCents[:, 0] : [[1.23710375 0.17480612]]\n",
      "centList:  [[-0.15772275000000002, 1.2253301166666664]]\n",
      "\n",
      "centList:  [matrix([[1.23710375, 0.17480612]])]\n",
      "sseSplit and notSplit 68.68654812621844 38.06295063565756\n",
      "sseSplit and notSplit 22.971771896318412 532.6598067890178\n",
      "len(centList): 2, bestCentToSplit:0\n",
      "The bestCentToSplit is : 0\n",
      "The len os bestClustAss is : 40\n",
      "The bestNewCents: [[ 2.93386365  3.12782785]\n",
      " [-0.45965615 -2.7782156 ]]\n",
      "The bestNewCents[:, 0] : [[2.93386365 3.12782785]]\n",
      "centList:  [matrix([[1.23710375, 0.17480612]]), matrix([[-2.94737575,  3.3263781 ]])]\n",
      "\n",
      "centList:  [matrix([[2.93386365, 3.12782785]]), matrix([[-2.94737575,  3.3263781 ]])]\n",
      "centroids:  [matrix([[2.93386365, 3.12782785]]), matrix([[-2.94737575,  3.3263781 ]]), matrix([[-0.45965615, -2.7782156 ]])]\n",
      "clusterAssment:  [[0.00000000e+00 1.45461050e-01]\n",
      " [1.00000000e+00 6.80213825e-01]\n",
      " [2.00000000e+00 1.02184582e+00]\n",
      " [0.00000000e+00 1.34548760e+00]\n",
      " [1.00000000e+00 1.35376464e+00]\n",
      " [2.00000000e+00 3.87167519e+00]\n",
      " [0.00000000e+00 8.37259951e-01]\n",
      " [1.00000000e+00 2.20116272e-01]\n",
      " [2.00000000e+00 3.53809057e+00]\n",
      " [0.00000000e+00 7.44081160e+00]\n",
      " [1.00000000e+00 5.28070040e+00]\n",
      " [2.00000000e+00 2.56674394e-02]\n",
      " [0.00000000e+00 1.11946529e+00]\n",
      " [1.00000000e+00 1.67890884e-01]\n",
      " [2.00000000e+00 2.11734245e+00]\n",
      " [0.00000000e+00 1.49635209e+00]\n",
      " [1.00000000e+00 4.93628241e+00]\n",
      " [2.00000000e+00 9.76749869e-03]\n",
      " [0.00000000e+00 1.32453845e-01]\n",
      " [1.00000000e+00 6.39346045e-01]\n",
      " [2.00000000e+00 9.41791924e-01]\n",
      " [0.00000000e+00 1.72445523e+00]\n",
      " [1.00000000e+00 7.50682798e-01]\n",
      " [2.00000000e+00 1.48785604e-01]\n",
      " [0.00000000e+00 3.00429548e+00]\n",
      " [1.00000000e+00 5.15437527e+00]\n",
      " [2.00000000e+00 1.80316434e+00]\n",
      " [0.00000000e+00 2.74825782e+00]\n",
      " [1.00000000e+00 4.66860313e-01]\n",
      " [2.00000000e+00 1.28807718e+00]\n",
      " [0.00000000e+00 1.76804356e+00]\n",
      " [1.00000000e+00 3.54002368e+00]\n",
      " [2.00000000e+00 2.12516750e+00]\n",
      " [0.00000000e+00 1.14812052e+00]\n",
      " [1.00000000e+00 1.78247878e+00]\n",
      " [2.00000000e+00 8.79445646e-01]\n",
      " [0.00000000e+00 3.23315472e+00]\n",
      " [1.00000000e+00 7.43934371e-01]\n",
      " [2.00000000e+00 2.36276631e+00]\n",
      " [0.00000000e+00 2.59370616e-01]\n",
      " [1.00000000e+00 1.82015977e+00]\n",
      " [2.00000000e+00 2.10599050e+00]\n",
      " [0.00000000e+00 2.94567602e+00]\n",
      " [1.00000000e+00 2.49952822e+00]\n",
      " [2.00000000e+00 1.54957269e+00]\n",
      " [0.00000000e+00 9.45169633e-01]\n",
      " [1.00000000e+00 2.91966903e+00]\n",
      " [2.00000000e+00 1.13851139e+00]\n",
      " [0.00000000e+00 5.09476462e+00]\n",
      " [1.00000000e+00 1.64971118e+00]\n",
      " [2.00000000e+00 1.98934951e-01]\n",
      " [0.00000000e+00 1.50301593e+00]\n",
      " [1.00000000e+00 2.13359760e-01]\n",
      " [2.00000000e+00 2.16005416e+00]\n",
      " [0.00000000e+00 2.63462894e+00]\n",
      " [1.00000000e+00 7.60898177e-02]\n",
      " [2.00000000e+00 2.60198288e-01]\n",
      " [0.00000000e+00 3.05416591e-03]\n",
      " [1.00000000e+00 3.16776316e+00]\n",
      " [2.00000000e+00 1.61040000e+00]]\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "    二分K 均值算法\n",
    "    将所有点作为一个簇，然后将该簇一分为二。之后选择其中一个簇（可以最大程度降低SSE（Sum of Square Error:误差平方和））\n",
    "    ，上述基于SSE不断重复，直到得到指定数目的簇\n",
    "\"\"\"\n",
    "def biKmeans(dataSet, k, distMeans=distEclud):\n",
    "    m = shape(dataSet)[0]\n",
    "    clusterAssment = mat(zeros((m,2)))\n",
    "    centroid0 = mean(dataSet, axis=0).tolist()[0]\n",
    "#     print(centroid0)\n",
    "    centList = [centroid0]\n",
    "#     print(centList)\n",
    "    for j in range(m):\n",
    "        clusterAssment[j, 1] = distMeans(mat(centroid0), dataSet[j, :]) ** 2\n",
    "#     print(clusterAssment)\n",
    "    while (len(centList) < k):\n",
    "        lowestSSE = inf\n",
    "        for i in range(len(centList)):\n",
    "            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0], :]\n",
    "            centroidMat, splitClustAss = KMeans(ptsInCurrCluster, 2, distMeans)\n",
    "            sseSplit = sum(splitClustAss[:,1])\n",
    "            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0], 1])\n",
    "            print(\"sseSplit and notSplit\", sseSplit, sseNotSplit)\n",
    "            if (sseSplit + sseNotSplit) < lowestSSE:\n",
    "                bestCentToSplit = i\n",
    "                bestNewCents = centroidMat\n",
    "                bestClustAss = splitClustAss.copy()\n",
    "                lowestSSE = sseSplit + sseNotSplit\n",
    "#         print(bestClustAss)\n",
    "        bestClustAss[nonzero(bestClustAss[:, 0].A==1)[0], 0] = len(centList)\n",
    "        bestClustAss[nonzero(bestClustAss[:, 0].A==0)[0], 0] = bestCentToSplit\n",
    "        print('len(centList): %s, bestCentToSplit:%s' % (len(centList), bestCentToSplit))\n",
    "        print(\"The bestCentToSplit is :\", bestCentToSplit)\n",
    "        print(\"The len os bestClustAss is :\", len(bestClustAss))\n",
    "        print(\"The bestNewCents:\", bestNewCents)\n",
    "        print(\"The bestNewCents[:, 0] :\", bestNewCents[0, :])\n",
    "        print(\"centList: \", centList)\n",
    "        print(\"\")\n",
    "        \n",
    "        centList[bestCentToSplit] = bestNewCents[0,:]\n",
    "        print(\"centList: \", centList)\n",
    "#         print(clusterAssment)\n",
    "        centList.append(bestNewCents[1, :])\n",
    "        clusterAssment[nonzero(clusterAssment[:,0].A==bestCentToSplit)[0], :] = bestClustAss\n",
    "    return centList, clusterAssment\n",
    "        \n",
    "\n",
    "centroids, clusterAssment = biKmeans(dataMat, 3)\n",
    "print(\"centroids: \", centroids)\n",
    "print(\"clusterAssment: \", clusterAssment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[matrix([[2.93386365, 3.12782785]]), matrix([[-2.94737575,  3.3263781 ]]), matrix([[-0.45965615, -2.7782156 ]])]\n",
      "[2.93386365, -2.94737575, -0.45965614999999993] [3.12782785, 3.3263781000000003, -2.7782156000000002]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.collections.PathCollection at 0x20de81efda0>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAD8CAYAAABjAo9vAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFC1JREFUeJzt3W+I3Vedx/HPN5M0Oya6Q5hZSiYZ03aXYLYRww61MA8MqZrWlhraB9uKIvggTxRasOlObGFdEBoIa32grAR3YaF/wkLj6JpqbI0RtpDiJNOaDTHSbWvaG8VkY6rdDM2ffvfBzLTz5/6/5/7O+Z37fkEg85ub+zv33tzPPfec7zk/c3cBAPKxLHYDAABhEewAkBmCHQAyQ7ADQGYIdgDIDMEOAJkh2AEgMwQ7AGSGYAeAzCyPcdLBwUHfsGFDjFMDQGkdO3bsvLsPNbpdlGDfsGGDJicnY5waAErLzH7bzO0YigGAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZiVLuWAYTUxXtPXRaZy9Oa+1Av3Zt36gdW4ZjNwtAolLKDIK9iompinYfOKHpK9ckSZWL09p94IQkEe4AlkgtMxiKqWLvodPvvUBzpq9c095DpyO1CEDKUssMgr2KsxenWzoOoLellhkEexVrB/pbOg6gt6WWGQT7IhNTFV26fHXJ8f4Vfdq1fWOEFgFI3a7tG9W/om/BsZiZUZrJ0yJmnBdPgMwZ6F+hr9/9t0ycAqhqLhuoimlBUTPO1SZAJGnVyuWEOoC6dmwZTiYnSjEUU9SMc2oTIADQjlIEe1GBm9oECAC0oxTBXlTgpjYBAhRtYqqisT2HdcP4QY3tOayJqUrsJqENpQj2ogJ3x5ZhPXbPZg0P9MskDQ/067F7NiczbgZ009xcVuXitFzvz2UR7uVTisnTImecU5oAAYpUby6L90S5lCLYJQIX6DaKB/JRiqEYAN1H8UA+CHbUxWRa76B4IB+lGYpB8VLbihTdldrqSbSPYEdNTKb1Huay8hBsKMbM+sxsysx+FOo+EVetSbMKk2lA0kL22B+QdErShwLeZxApXbKqTNYO9FcNcdPMc8pziFbxXixGkB67ma2TdKek74W4v5BYdNG+Xds3yqocd4mrSaFlvBeLE2oo5luSHpb0bqD7Cya1S1aVqcpkx5ZheY3fUduMVqX2XsxZx8FuZndJ+oO7H2twu51mNmlmk+fOnev0tE1LadFFGXssw9Q2I5CU3ou5C9FjH5N0t5m9Lmm/pG1m9sTiG7n7PncfdffRoaGhAKdtTkqLLsrYY6G2GaGk9F4sQsxv5x0Hu7vvdvd17r5B0n2SDrv75ztuWSApBVMZeyxsjIZQUnovdlvsb+fZ17GntOiiVpVJ6j0WapsRQkrvxW6LvQYkaLC7+xFJR0LeZwipBNOu7RuXXFM11x4LUE0q78Vui/3tnL1iCsSwBtAbYs8nZD8Uk5p2eiws6gDKJfa3c4I9cUVtxMWHR+8q6rUvw/+xUG2MPZ9g7rWWoHTP6OioT05OFn7eMjp+48d0+eo13fe5PQuODw/064XxbUHOsfjDQ5rpXTBMlL+iXvtG50kh9Lv5XIR6fGZ2zN1HG92OMfbEXb56rerxkJMwZayvRxhFvfb1zhO7NLCZNnYixuMj2BN33fK+qsdDTsKwi2NvqLZgpqjqjXrnSaVj0a3nIsbjI9gTN7KmX8ts4VZcoSdhan1IzO3iiDTUWsnYzArHWr3Gv+xfUfVcoas36lWJxC4NnN+WVo43K8bjI9gTN7h6pW4cWtXVEkl2cUxfrWB+dOJEU1/za/UazVTIatB6q04XB+f+p8a1/6nxwhfudWtlbIzSR4K9BAZXr9QL49v02p479cL4tuCTSuzimL5awfz0i2809TW/1ut48dKVQtZW1FvDUS1Ql5kVvnCvW+tMYmylQLljKrZurX78F7+o//sjR4Kcfrik2x30ilrBfK1GVdvi29fbzqKo1aC1zrO4NPC65X0aWdOvWyJUZHXjuYhR+kiwQ1L8BRWor1Yw95lVDffFH8ipv74LAvXo3riN6YKit1Ig2FNRq+c911MP1DOvJfaCik6kUAPdbbWC+d6/G9YzxyoNA7vMry9aR7DjPWXcoKmolbmx1Qvm0Q+vaSqwy/L6nn/7HZ25MK17xw/yAdQmVp6mrqAee1mN7TlcdYgi5MpcdEmVeaPzb7+jwWNHJUlH198saWYi9cahVRpcvXLmRj38Xmh25Sk9dnQs5lBIKjXQCOPMhWkNLjr2rvvM8blgR0MEOzoSeyikrBcvgar2vO8dP6innxqXpAX7I5mk1/bcueC2vTC30i7q2BPx6MQJ3bT7WW0YP6ibdj+rRydmwvH82+/o+JmLUa6b2IzYy8F76XJrvaDZxTyp7C+TKnrsgXTSe3h04oSeOHrmvZ+vueuJo2f02rm3dfyOf0x6YjD2UAjVHnnZtX2jlv2L6d15c3/VPqhjX3oudQR7AJ0ORzz94htVj7/wPxeWHEvtP28KQyFlqfZAYzu2DOv80CqduTAtk2p+UMfuUKSOYA+g095DrdWDtaT0nzf1hS8on8HVKzW4euWSMfX5UuhQpIwx9gA67T30WbUtuGpL6T8v13FFDMyt1EePPYBOew/3f3z9gjH2OWM3rdHxM28l3xtmKARBNVGnztxKfQR7AM0MR9SbXP3Gjs2SZsbar7mrz0z3f3y9vrFjMyVdQA10KGpj5Wkg9QKYa4oCCIGVpwWr13soS2kW3w6A95X5/UCwF6AMpVmxV5ACKSn7+4GqmALEuDRWq2KvIAVSUvb3A8FegDKUZpXhWwXia+bC2Tko+/uBYC9AGWq9y/CtAnH10v4sZX8/MMZekNRLs1hBikbKUgQQQtnfDwQ7JLHgA42VfXiiFWV/PxDseE/q3yoQV6/tz1Lm9wNj7ACaErMIoFcmbUOhxw6gKbGGJ0LVlJd5wVGrCHYATYsxPBFi0rbsC45a1fFQjJmtN7Ofm9kpMztpZg+EaBgASGEmbcu+4KhVIcbYr0r6qrt/RNKtkr5sZpsC3C8ABKkprzbpK+VZ0SMFCHZ3/527H5/9+58lnZKU33cbAFF0Omk7MVVRrUvZ5FrRE3SM3cw2SNoi6cWQ9wugd3U6abv30GlV25zcpNIsOGpVsGA3s9WSnpH0oLv/qcrvd0raKUkjIyOhTgugB3QyaVtruMWV58SpFKiO3cxWaCbUn3T3A9Vu4+773H3U3UeHhoZCnBYAGqo13DKc6TCMFKYqxiT9q6RT7v7NzpsEAOGUYXfV0EL02MckfUHSNjN7afbPZwLcLwB0rAy7q4bW8Ri7u/+XVHPSGQCiK/O+L+1grxgAyAxbChSsl/arABAHwV6gXtuvAkAcDMUUqNf2qwAQB8FeoF66Ag2AeAj2ApX9ArkAyoFgbyDklVt6caEEgOIxeVpH6MnOsl8gF0A5EOx1hLhyy2K9tlACQPEYiqmDyU4AZUSw18FkJ4AyItjrYLITQBkxxl4Hk50Ayohgb4DJTgBlw1AMAGSGYAeAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZIdgBIDMEOwBkhmAHgMwQ7ACQGfaKARDMxFSFTfMSQLADCCL0pSTRPoZiAARR71KSKBbBDiAILiWZDoIdQBBcSjIdBDuAILiUZDqYPAUQBJeSTAfBDiAYLiWZBoZiACAzQYLdzG43s9Nm9oqZjYe4TwBAezoOdjPrk/QdSXdI2iTpfjPb1On9AgDaE6LHfoukV9z9VXe/LGm/pM8GuF8AQBtCBPuwpDfm/fzm7DEAQAQhgt2qHPMlNzLbaWaTZjZ57ty5AKcFAFQTItjflLR+3s/rJJ1dfCN33+fuo+4+OjQ0FOC0AIBqQgT7LyX9jZndYGbXSbpP0g8D3C8AoA0dL1By96tm9hVJhyT1Sfo3dz/ZccsAAG0JsvLU3Z+V9GyI+wIAdIaVpwCQGYIdADJDsANAZgh2AMgMwQ4AmSHYASAzBDsAZIZgB4DMEOwAkBmCHQAyQ7ADQGaC7BUDhDYxVdHeQ6d19uK01g70a9f2jdqxheu3AM0g2JGciamKdh84oekr1yRJlYvT2n3ghCQR7kATGIpBcvYeOv1eqM+ZvnJNew+djtQioFwIdiTn7MXplo4DWIhgR3LWDvS3dBzAQgQ7krNr+0b1r+hbcKx/RZ92bd8YqUVAuTB5iuTMTZBSFQO0h2BHknZsGSbIgTYR7AiG2nMgDQQ7gqD2HEgHk6cIgtpzIB0EO4Kg9hxIB8GOIKg9B9JBsCOImLXnE1MVje05rBvGD2psz2FNTFW6fk4gZUyeIohYtedM2gJLEewIJkbteb1JW4IdvYqhGJQak7bAUgQ7So1JW2Apgh2lxoZhwFKMsaPU2DAMWIpgR+mxYRiwEEMxAJAZgh0AMkOwA0BmOgp2M9trZr82s1+Z2ffNbCBUwwAA7em0x/6cpJvd/aOSfiNpd+dNAgB0oqNgd/efuvvV2R+PSlrXeZMAAJ0IOcb+JUk/Dnh/AIA2NKxjN7PnJV1f5VePuPsPZm/ziKSrkp6scz87Je2UpJGRkbYaCwBorGGwu/sn6/3ezL4o6S5Jt7m717mffZL2SdLo6GjN2yFtXLAaSF9HK0/N7HZJ/yDpE+5+KUyTkCr2PgfKodMx9m9L+qCk58zsJTP7boA2IVFcsBooh4567O7+16EagvSx9zlQDmwChqatHehXpUqIs/f5DOYfkAq2FEDT2Pu8trn5h8rFabnen3/gwtqIgR47GprfEx34wAqtXL5Mb01fid4rTamHzLVXkRKCHXUtroT546Ur6l/Rp8f//mNRAyu1Ch3mH5AShmJQV6qVME23a+vWmT9dxrVXkRKCHXWl2hNtpl0TUxUdP3NRR1/9X43tOdzV8W7mH5ASgh11pdoTbdSuuaGay1cXDtV0K9x3bBnWY/ds1vBAv0zS8EC/HrtnM+PriIIxdtS1a/vGBWPZUho90UbtijGZybVXkQqCHXXNBVUq1SfNtivVISSgCAQ7Gkq1J1qtXXMlkLV2mYs9hAQUgWBHslqqU9+6Vefffkdrz/2f/nneJqO3vvHfkqT9T41rmZluHFol/WTl+//uyJEuPgIgDoIdSWqnTv3MhWm9W2Pn6OuW92lkTb8GV6+s+nsgJwQ7ktTy5OeRI7p3/OCSIZj9T41Lkm599aUutRRID8GOYEIu8W9n8rPWJmXXLe+rcuu4UtoOAfmhjh1BhN4Eq536+WqLhJaZaWRNcxOmE1MVje05rBvGD3Z1QRMbhqHbCHYEEXrrgXZWclZbJHTj0KqmxtWLDNtUt2lAPhiKQRCh68bbrZ9fUgL5k+YmS4tc0ESNPbqNYEcQ3bgIR5H180WGLRcsQbcxFIMgkt0E68iRpmrVi9wTJ9nnCtkg2BFE2TfBKjJsy/5cIX3mNRZ0dNPo6KhPTk4Wfl6kI8VyvxTbBMxnZsfcfbTR7RhjR+FSu/rRnHbH9PlAQGoIdhQup+uDNvshRfijSAQ7CpdTuV8zH1KNwp/QR2gEOwqXU7lfMx9SjRYkpTgshXKjKgaFy6ncr5kyyXrhzypUdAPBjsLlVO7XzIdUvfDPaVgK6WAoBlGkelWmVjWz9UG967P+03+e1B8vXVlyvwMfWNH9xiNbBDvQoUYfUvXC/+s/PFn130RYXoKMEOxAAWqF/1vTS3vr9Y4DzWCMHYioyD1q0DsIdmSlqItlhJJThRDSwVAMspHqVgX1tLvvPFAPwY5slHWrglwqhJCOIEMxZvaQmbmZDYa4P6Ad1IQDMzoOdjNbL+lTks503hygfUxEAjNC9Ngfl/SwJCpvERUTkcCMjsbYzexuSRV3f9nMAjUJaA8TkcCMhsFuZs9Lur7Krx6R9DVJn27mRGa2U9JOSRoZGWmhiUDzmIgEOrg0npltlvQzSZdmD62TdFbSLe7++3r/lkvjIWXsj45Udf3SeO5+QtJfzTvh65JG3f18u/cJxFbGWnhgMVaeAvOwPzpyEGyBkrtvCHVfQCzUwiMH9NiBeaiFRw4IdmAeauGRA/aKAeahFh45INiBRaiFR9kxFAMAmSHYASAzBDsAZIZgB4DMEOwAkJm2NwHr6KRm5yT9tvATt25QUi/vfdPrj1/iOej1xy+l9Rx82N2HGt0oSrCXhZlNNrOTWq56/fFLPAe9/vilcj4HDMUAQGYIdgDIDMFe377YDYis1x+/xHPQ649fKuFzwBg7AGSGHjsAZIZgb5KZPWRmbmaDsdtSJDPba2a/NrNfmdn3zWwgdpuKYGa3m9lpM3vFzMZjt6doZrbezH5uZqfM7KSZPRC7TTGYWZ+ZTZnZj2K3pRUEexPMbL2kT0k6E7stETwn6WZ3/6ik30jaHbk9XWdmfZK+I+kOSZsk3W9mm+K2qnBXJX3V3T8i6VZJX+7B50CSHpB0KnYjWkWwN+dxSQ9L6rkJCXf/qbtfnf3xqKR1MdtTkFskveLur7r7ZUn7JX02cpsK5e6/c/fjs3//s2bCraf2MjazdZLulPS92G1pFcHegJndLani7i/HbksCviTpx7EbUYBhSW/M+/lN9ViozWdmGyRtkfRi3JYU7lua6dC9G7shreJCG5LM7HlJ11f51SOSvibp08W2qFj1Hr+7/2D2No9o5uv5k0W2LRKrcqznvq1JkpmtlvSMpAfd/U+x21MUM7tL0h/c/ZiZbY3dnlYR7JLc/ZPVjpvZZkk3SHrZzKSZYYjjZnaLu/++wCZ2Va3HP8fMvijpLkm3eW/Ux74paf28n9dJOhupLdGY2QrNhPqT7n4gdnsKNibpbjP7jKS/kPQhM3vC3T8fuV1NoY69BWb2uqRRd09lQ6CuM7PbJX1T0ifc/Vzs9hTBzJZrZqL4NkkVSb+U9Dl3Pxm1YQWymZ7Mv0u64O4Pxm5PTLM99ofc/a7YbWkWY+xo5NuSPijpOTN7ycy+G7tB3TY7WfwVSYc0M2n4H70U6rPGJH1B0rbZ1/2l2d4rSoAeOwBkhh47AGSGYAeAzBDsAJAZgh0AMkOwA0BmCHYAyAzBDgCZIdgBIDP/D6hKsSLCa20iAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "# centroids\n",
    "x = []\n",
    "y = []\n",
    "for i in centroids:\n",
    "    x.append(i.A[0][0])\n",
    "    y.append(i.A[0][1])\n",
    "print(centroids)\n",
    "print(x, y)\n",
    "# dataMat.A\n",
    "plt.scatter(dataMat.A[:, 0], dataMat.A[:, 1])\n",
    "plt.scatter(x, y, marker='+', c='red', s=200)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
